In this notebook, we will train a Latent Dirichlet Allocation (LDA) model on tweets to learn sets of words that commonly appear together, each set hopefully corresponding to a topic. We will train the LDA model on the whole corpus of tweets and extract 10 topics. Additionally, we will visualize the results using the pyLDAvis library.
Next, we will take these results to a separate notebook for analysis. There, we will assign a topic distribution to each tweet based on the words it contains, and sum the topic distributions of all tweets belonging to a state to obtain the topic distribution per state.
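As a rough preview of that follow-up step, here is a minimal sketch of how the per-state topic distributions could be computed with gensim and pandas. It assumes the `ldamodel`, `dictionary`, and `tweets` DataFrame (with `token` and `state` columns) that are built later in this notebook; the helper name `state_topic_distributions` is hypothetical.
In [ ]:
import numpy as np
import pandas as pd

def state_topic_distributions(tweets, ldamodel, dictionary, nr_topics=10):
    '''Hypothetical helper: sum per-tweet topic distributions by state.'''
    def tweet_topics(tokens):
        # infer the topic mixture of one tweet from its bag-of-words
        bow = dictionary.doc2bow(tokens)
        dense = np.zeros(nr_topics)
        for topic_id, weight in ldamodel.get_document_topics(bow, minimum_probability=0.0):
            dense[topic_id] = weight
        return dense
    per_tweet = tweets['token'].apply(tweet_topics)
    # stack into an (n_tweets x nr_topics) matrix and sum the rows per state
    topic_matrix = pd.DataFrame(per_tweet.tolist(), index=tweets.index)
    per_state = topic_matrix.groupby(tweets['state']).sum()
    # normalize each state's row so it is a distribution over topics
    return per_state.div(per_state.sum(axis=1), axis=0)
The normalization at the end is optional; the raw sums could be kept instead if the absolute tweet volume per state matters for the analysis.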
In [2]:
from pymongo import MongoClient
import json
client = MongoClient()
db = client.Twitter
In [3]:
import pandas as pd
import time
import re
from nltk.tokenize import RegexpTokenizer
import HTMLParser # In Python 3.4+ import html
import nltk
from nltk.corpus import stopwords
In [4]:
start_time = time.time()
# keep only English-language tweets geotagged inside the US
filter_query = {
    "$and": [{"place.country_code": "US"}, {"lang": "en"}]
}
# keep only our fields of interest
columns_query = {
    'text': 1,
    'entities.hashtags': 1,
    'entities.user_mentions': 1,
    'place.full_name': 1,
    'place.bounding_box': 1
}
tweets = pd.DataFrame(list(db.tweets.find(
    filter_query,
    columns_query
)  # .limit()
))
elapsed_time = time.time() - start_time
print elapsed_time
In [5]:
# extract the two-letter state abbreviation from the end of place.full_name
tweets['state'] = map(lambda place_dict: place_dict['full_name'][-2:], tweets['place'])
In [6]:
tweets['state'].value_counts().head()
Out[6]:
In [7]:
# #for one state only
# state = 'CA'
# tweets = tweets[tweets['state']==state]
len(tweets)
Out[7]:
In [8]:
def Clean(unescaped_tweet):
    '''Take a tweet as input and return a list of lowercase word tokens.'''
    tokenizer = RegexpTokenizer(r'\w+')
    cleaned_tweet_tokens = tokenizer.tokenize(unescaped_tweet.lower())
    return cleaned_tweet_tokens
start_time = time.time() # start timing
# remove URLs from the tweet text
tweets['text'] = tweets['text'].apply(lambda tweet: re.sub(r"http\S+", "", tweet))
#########################################################
def trump_mention(tweet):
    '''Return True if the tweet mentions Trump by first or last name.'''
    lowered = tweet.lower()
    return ('trump' in lowered) or ('donald' in lowered)
tweets['Trump'] = tweets['text'].apply(trump_mention)
##############################################################
# tweet mentions start with @, hashtags with #
# create two columns holding the mentions and the hashtags
tweets['mentions'] = tweets['text'].apply(lambda tweet: re.findall(r'\@\w+',tweet))
tweets['hashtags'] = tweets['text'].apply(lambda tweet: re.findall(r'\#\w+',tweet))
#remove hashtags and mentions
tweets['text'] = tweets['text'].apply(lambda tweet: re.sub(r"\@\w+" , "", tweet))
tweets['text'] = tweets['text'].apply(lambda tweet: re.sub(r"\#\w+" , "", tweet))
# remove digits from the text
tweets['text'] = tweets['text'].apply(lambda tweet: ''.join([i for i in tweet if not i.isdigit()]))
# remove the candidates' first and last names, plus "USA" and the HTML-escape leftover "amp"
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"Trump" , "", tweet))
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"Clinton" , "", tweet))
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"Donald" , "", tweet))
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"Hillary" , "", tweet))
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"USA" , "", tweet))
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"amp" , "", tweet))
# tokenize the text into a new column, then drop duplicate tokens and English stopwords
tweets['token'] = tweets['text'].apply(lambda tweet: Clean(tweet))
tweets['token'] = tweets['token'].apply(lambda x: list(set(x) - set(stopwords.words('english'))))
elapsed_time = time.time() - start_time #time ends
print elapsed_time
tweets.head()
Out[8]:
In [9]:
tweets.head()
Out[9]:
In [10]:
#test['tags'] = map(lambda tweet: map(lambda tweet: tweet['text'] , tweet['entities']['hashtags']) if tweet['entities']['hashtags'] != None else None, raw_tweet[:100])
#tweets['text'][9]
In [11]:
doc_complete = tweets['token'].tolist()
doc_complete[:2]
Out[11]:
In [12]:
import gensim
In [13]:
import pickle
In [14]:
import gensim
from gensim import corpora
# Create the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_complete)
pickle.dump(dictionary, open( 'dictionary2.pickle', "wb" ) )
# Convert the list of documents (corpus) into a document-term matrix (bag-of-words) using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_complete]
pickle.dump(doc_term_matrix, open( 'doc_term_matrix.pickle', "wb" ) )
In [15]:
Lda = gensim.models.ldamulticore.LdaMulticore
In [ ]:
nr_topics = 10
nr_passes = 100
start_time = time.time()
# Create the LDA model object using the gensim library
# and train it on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=nr_topics, id2word = dictionary, passes=nr_passes)
elapsed_time = time.time() - start_time
In [18]:
print 'Topic modelling for', nr_topics,'topics,', nr_passes,'passes,',len(tweets),'tweets:','\ncomplete in',elapsed_time/60.,'minutes'
In [ ]:
# Runtimes:
# Florida (~4K tweets): ~16 min for 10 topics, 300 passes
# CA (~57K tweets): ~48 min for 10 topics, 300 passes
In [ ]:
# Can we do it on the whole dataset? -> take the topics and classify each tweet within them.
# Then we have discrete topics, each with its own word weights,
# so each tweet is represented by its topic weights (see the sketch below).
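A minimal sketch of that representation, assuming the `ldamodel`, `dictionary`, and `doc_complete` defined above: gensim's `get_document_topics` maps any bag-of-words to a list of (topic id, weight) pairs.
In [ ]:
# represent a single tweet as a topic-weight vector
example_tokens = doc_complete[0]
bow = dictionary.doc2bow(example_tokens)
print ldamodel.get_document_topics(bow, minimum_probability=0.0)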
In [ ]:
# Print all topics, describing each with its top 50 words.
topics = ldamodel.print_topics(num_topics=nr_topics, num_words=50)
for topic in topics:
    print topic
    print ""
In [5]:
import pickle
In [7]:
nr_topics = 10
nr_passes = 100
In [8]:
state = 'allstates'
name = "trained models/lda/%s_%itopics_%ipasses.pickle" % (state, nr_topics, nr_passes)
print "Proceed to save model in:", name
In [ ]:
pickle.dump(ldamodel, open( name, "wb" ) )
In [9]:
#load
ldamodel = pickle.load(open(name,'rb'))
In [13]:
import time
In [10]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
In [14]:
#load the LDA results (model, dictionary and corpus)
start_time = time.time()
ldamodel = pickle.load(open('trained models/lda/allstates_10topics_100passes.pickle'))
dictandcorpus = pickle.load(open('trained models/lda/Dictionary.pickle'))
c = dictandcorpus[1]  # the corpus (document-term matrix)
d = dictandcorpus[0]  # the gensim dictionary
del dictandcorpus
elapsed_time = time.time() - start_time
print elapsed_time
In [18]:
data = pyLDAvis.gensim.prepare(ldamodel, c, d)
In [19]:
data
Out[19]:
In [ ]:
#save results as an html file
pyLDAvis.save_html(data, open('LDA topics.html','wb'))